#!/bin/bash
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

############################################################################
## This script generates custom operations needed by some of the
## Triton's CI tests. Generating these custom operations requires
## using the TensorFlow container.
##
## 1. Update TENSORFLOW_IMAGE and PYTORCH_IMAGE to match what is being
## used by the Triton release being tested.
##
## 2. Run this script to create /tmp/qa_custom_ops.
##
############################################################################

TRITON_VERSION=21.07
TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$TRITON_VERSION-tf1-py3}
PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$TRITON_VERSION-py3}

CUDA_DEVICE=0

###
HOST_SRCDIR=/tmp/gen_srcdir
HOST_DESTDIR=/tmp/$TRITON_VERSION/qa_custom_ops

rm -fr $HOST_DESTDIR
mkdir -p $HOST_DESTDIR/tf_custom_ops $HOST_DESTDIR/libtorch_custom_ops

rm -fr $HOST_SRCDIR
mkdir -p $HOST_SRCDIR

cp ./gen_qa_custom_ops_models.py $HOST_SRCDIR/.
cp ./cuda_op_kernel.cu.cc.patch $HOST_SRCDIR/.
cp ./busy_op_kernel.* $HOST_SRCDIR/.
TFSCRIPT=tf_gen.cmds
PYTSCRIPT=pyt_gen.cmds

SRCDIR=/tmp/src
DESTDIR=/tmp/custom_ops

# Tensorflow
cat >$HOST_SRCDIR/$TFSCRIPT <<EOF
#!/bin/bash -x
set -e

TF_CFLAGS=\$(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))')
TF_LFLAGS=\$(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))')

# No CUDA
cd /tmp
cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/zero_out_op_kernel_1.cc .
g++ -std=c++11 -O2 -shared -fPIC zero_out_op_kernel_1.cc -o $DESTDIR/libzeroout.so \${TF_CFLAGS[@]} \${TF_LFLAGS[@]}

# CUDA. Need to patch so that we can build it outside of bazel/TF
cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/cuda_op_kernel.cc .
cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc .
patch -i $SRCDIR/cuda_op_kernel.cu.cc.patch cuda_op_kernel.cu.cc
nvcc -std=c++11 -O2 -c -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
g++ -std=c++11 -shared -o $DESTDIR/libcudaop.so cuda_op_kernel.cc cuda_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}

cp $SRCDIR/busy_op_kernel.cc .
cp $SRCDIR/busy_op_kernel.cu.cc .
nvcc -std=c++11 -O2 -c -o busy_op_kernel.cu.o busy_op_kernel.cu.cc \${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
g++ -std=c++11 -shared -o $DESTDIR/libbusyop.so busy_op_kernel.cc busy_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}

python3 $SRCDIR/gen_qa_custom_ops_models.py --graphdef --savedmodel \
    --models_dir=$DESTDIR --zero_out_lib_path=$DESTDIR/libzeroout.so \
    --cuda_op_lib_path=$DESTDIR/libcudaop.so \
    --busy_op_lib_path=$DESTDIR/libbusyop.so
chmod -R 777 $DESTDIR
EOF

chmod a+x $HOST_SRCDIR/$TFSCRIPT
if [ $? -ne 0 ]; then
    echo -e "Failed: chmod"
    exit 1
fi

docker pull $TENSORFLOW_IMAGE
docker run --gpus device=$CUDA_DEVICE --rm --entrypoint $SRCDIR/$TFSCRIPT \
       --mount type=bind,source=$HOST_SRCDIR,target=$SRCDIR \
       --mount type=bind,source=$HOST_DESTDIR/tf_custom_ops,target=$DESTDIR \
       $TENSORFLOW_IMAGE
if [ $? -ne 0 ]; then
    echo -e "Failed"
    exit 1
fi

# PyTorch
cat >$HOST_SRCDIR/$PYTSCRIPT <<EOF
#!/bin/bash -x
set -e
python3 $SRCDIR/gen_qa_custom_ops_models.py --libtorch --models_dir=$DESTDIR
cp /root/.cache/torch_extensions/custom_modulo/custom_modulo.so $DESTDIR/libtorch_modulo/.
chmod -R 777 $DESTDIR
EOF

chmod a+x $HOST_SRCDIR/$PYTSCRIPT
if [ $? -ne 0 ]; then
    echo -e "Failed: chmod"
    exit 1
fi

docker pull $PYTORCH_IMAGE
docker run --gpus device=$CUDA_DEVICE --rm --entrypoint $SRCDIR/$PYTSCRIPT \
       --mount type=bind,source=$HOST_SRCDIR,target=$SRCDIR \
       --mount type=bind,source=$HOST_DESTDIR/libtorch_custom_ops,target=$DESTDIR \
       $PYTORCH_IMAGE
if [ $? -ne 0 ]; then
    echo -e "Failed"
    exit 1
fi
